# Datasets available at:
# UCI, Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# Data Folder: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
# 1. White wine: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
# 2. Red Wine: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# Importing: Pandas, NumPy, Matplotlib, Seaborn and Scikit-Learn libraries.
import pandas as pd
import numpy as ny
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
#
# Importing both the white wine and red wine datasets.
# Both UCI CSV files use ';' (not ',') as the field separator.
white_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')
red_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
# Preview the first five rows of the white-wine frame.
white_wine.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
# Preview the last five rows of the white-wine frame.
white_wine.tail()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 |
# Shape, per-column dtypes/null counts, and summary statistics for the white-wine frame.
print(white_wine.shape)
white_wine.info()
white_wine.describe()
(4898, 12) <class 'pandas.core.frame.DataFrame'> RangeIndex: 4898 entries, 0 to 4897 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 4898 non-null float64 1 volatile acidity 4898 non-null float64 2 citric acid 4898 non-null float64 3 residual sugar 4898 non-null float64 4 chlorides 4898 non-null float64 5 free sulfur dioxide 4898 non-null float64 6 total sulfur dioxide 4898 non-null float64 7 density 4898 non-null float64 8 pH 4898 non-null float64 9 sulphates 4898 non-null float64 10 alcohol 4898 non-null float64 11 quality 4898 non-null int64 dtypes: float64(11), int64(1) memory usage: 459.3 KB
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
# Preview the first five rows of the red-wine frame.
red_wine.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# Preview the last five rows of the red-wine frame.
red_wine.tail()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1594 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
| 1595 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
| 1596 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
| 1597 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
| 1598 | 6.0 | 0.310 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 | 6 |
# Shape, per-column dtypes/null counts, and summary statistics for the red-wine frame.
print(red_wine.shape)
red_wine.info()
red_wine.describe()
(1599, 12) <class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# There are a total of 4898 samples of the white wine variant and 1599 samples of the red wine variant.
# Both the red and white wine samples are qualified by a set of eleven physicochemical features.
# These physicochemical attributes are: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides,
# free sulfur dioxide, total sulfur dioxide, density, pH, sulphates and alcohol content.
# All eleven of these attributes are continuously distributed and of the 'float' datatype.
#
# The twelfth attribute, quality, can be considered a sensory information indicator:
# a quality score between 0 and 10, of the 'integer' datatype.
# This quality attribute is our attribute of interest — the one we attempt to classify and predict.
# Consolidate the white-wine and red-wine samples into a single frame.
# ignore_index=True rebuilds a clean 0..6496 RangeIndex; a plain concat
# keeps each source frame's row labels and leaves 1599 duplicated index
# values (0..1598 appear twice), which can cause subtle alignment bugs.
wine_data_all = pd.concat([white_wine, red_wine], axis=0, ignore_index=True)
print(wine_data_all.shape)
wine_data_all.info()
wine_data_all.describe()
(6497, 12) <class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null int64 dtypes: float64(11), int64(1) memory usage: 659.9 KB
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 7.215307 | 0.339666 | 0.318633 | 5.443235 | 0.056034 | 30.525319 | 115.744574 | 0.994697 | 3.218501 | 0.531268 | 10.491801 | 5.818378 |
| std | 1.296434 | 0.164636 | 0.145318 | 4.757804 | 0.035034 | 17.749400 | 56.521855 | 0.002999 | 0.160787 | 0.148806 | 1.192712 | 0.873255 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.000000 | 0.992340 | 3.110000 | 0.430000 | 9.500000 | 5.000000 |
| 50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994890 | 3.210000 | 0.510000 | 10.300000 | 6.000000 |
| 75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 156.000000 | 0.996990 | 3.320000 | 0.600000 | 11.300000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 |
# Print the consolidated frame (pandas truncates the middle rows in the repr).
print(wine_data_all)
fixed acidity volatile acidity citric acid residual sugar chlorides \
0 7.0 0.270 0.36 20.7 0.045
1 6.3 0.300 0.34 1.6 0.049
2 8.1 0.280 0.40 6.9 0.050
3 7.2 0.230 0.32 8.5 0.058
4 7.2 0.230 0.32 8.5 0.058
... ... ... ... ... ...
1594 6.2 0.600 0.08 2.0 0.090
1595 5.9 0.550 0.10 2.2 0.062
1596 6.3 0.510 0.13 2.3 0.076
1597 5.9 0.645 0.12 2.0 0.075
1598 6.0 0.310 0.47 3.6 0.067
free sulfur dioxide total sulfur dioxide density pH sulphates \
0 45.0 170.0 1.00100 3.00 0.45
1 14.0 132.0 0.99400 3.30 0.49
2 30.0 97.0 0.99510 3.26 0.44
3 47.0 186.0 0.99560 3.19 0.40
4 47.0 186.0 0.99560 3.19 0.40
... ... ... ... ... ...
1594 32.0 44.0 0.99490 3.45 0.58
1595 39.0 51.0 0.99512 3.52 0.76
1596 29.0 40.0 0.99574 3.42 0.75
1597 32.0 44.0 0.99547 3.57 0.71
1598 18.0 42.0 0.99549 3.39 0.66
alcohol quality
0 8.8 6
1 9.5 6
2 10.1 6
3 9.9 6
4 9.9 6
... ... ...
1594 10.5 5
1595 11.2 6
1596 11.0 6
1597 10.2 5
1598 11.0 6
[6497 rows x 12 columns]
# This wine_data_all is the consolidation of the white wine (4898 samples) and red wine (1599 samples) samples,
# and this consolidated dataset of 4898 + 1599 = 6497 entries would be the sample set, used for our analysis.
# Check for missing values.
# Per-column count of nulls; all zeros means no imputation is needed.
wine_data_all.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
# This consolidated dataset would have no missing values.
# Distribution of the eleven physicochemical features, namely: fixed acidity, volatile acidity, citric acid, residual sugar,
# chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates and alcohol content.
# One histogram per feature; (column name, axis label) pairs keep the
# original plot order and labels while avoiding eleven copy-pasted cells.
feature_labels = [
    ('fixed acidity', 'Fixed Acidity'),
    ('volatile acidity', 'Volatile Acidity'),
    ('citric acid', 'Citric Acid'),
    ('residual sugar', 'Residual Sugar'),
    ('chlorides', 'Chlorides'),
    ('free sulfur dioxide', 'Free Sulfur Dioxide'),
    ('total sulfur dioxide', 'Total Sulfur Dioxide'),
    ('density', 'Density'),
    ('pH', 'pH'),
    ('sulphates', 'Sulphates'),
    ('alcohol', 'Alcohol Content'),
]
for column, label in feature_labels:
    plt.hist(wine_data_all[column])
    plt.xlabel(label)
    plt.ylabel("Frequency distribution")
    plt.show()
# Distribution of all the attributes in the consolidated dataset.
wine_data_all.hist(bins=25, figsize=(20, 20))
plt.show()
# We can see that the attributes follow different distributions and also have different ranges and scales.
# NOTE(review): several features (e.g. residual sugar max 65.8, chlorides max 0.611) show long
# right tails relative to their quartiles — the claim that no outliers need actioning should be confirmed.
wine_data_all.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 7.215307 | 0.339666 | 0.318633 | 5.443235 | 0.056034 | 30.525319 | 115.744574 | 0.994697 | 3.218501 | 0.531268 | 10.491801 | 5.818378 |
| std | 1.296434 | 0.164636 | 0.145318 | 4.757804 | 0.035034 | 17.749400 | 56.521855 | 0.002999 | 0.160787 | 0.148806 | 1.192712 | 0.873255 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.000000 | 0.992340 | 3.110000 | 0.430000 | 9.500000 | 5.000000 |
| 50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994890 | 3.210000 | 0.510000 | 10.300000 | 6.000000 |
| 75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 156.000000 | 0.996990 | 3.320000 | 0.600000 | 11.300000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 |
# Distribution and details of our attribute of interest: the quality column.
quality_series = wine_data_all['quality']
plt.hist(quality_series)
plt.xlabel("Quality")
plt.ylabel("Frequency distribution of wine quality")
plt.show()
# Summary statistics and per-score counts of the quality ratings.
quality_series.describe()
print(quality_series.value_counts())
6 2836 5 2138 7 1079 4 216 8 193 3 30 9 5 Name: quality, dtype: int64
# It's interesting to note that 6053 (2836 + 2138 + 1079) entries out of the total 6497 entries,
# about 93%, have a quality rating of either 5, 6 or 7 — the quality classes are heavily imbalanced.
wine_data_all.quality.describe()
count 6497.000000 mean 5.818378 std 0.873255 min 3.000000 25% 5.000000 50% 6.000000 75% 6.000000 max 9.000000 Name: quality, dtype: float64
# Check for correlations between the attributes
# Pairwise Pearson correlation heatmap of all twelve columns.
sns.heatmap(wine_data_all.corr(), cmap='coolwarm')
<AxesSubplot:>
# Full pairwise correlation matrix (the numeric values behind the heatmap).
wine_data_all.corr()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.000000 | 0.219008 | 0.324436 | -0.111981 | 0.298195 | -0.282735 | -0.329054 | 0.458910 | -0.252700 | 0.299568 | -0.095452 | -0.076743 |
| volatile acidity | 0.219008 | 1.000000 | -0.377981 | -0.196011 | 0.377124 | -0.352557 | -0.414476 | 0.271296 | 0.261454 | 0.225984 | -0.037640 | -0.265699 |
| citric acid | 0.324436 | -0.377981 | 1.000000 | 0.142451 | 0.038998 | 0.133126 | 0.195242 | 0.096154 | -0.329808 | 0.056197 | -0.010493 | 0.085532 |
| residual sugar | -0.111981 | -0.196011 | 0.142451 | 1.000000 | -0.128940 | 0.402871 | 0.495482 | 0.552517 | -0.267320 | -0.185927 | -0.359415 | -0.036980 |
| chlorides | 0.298195 | 0.377124 | 0.038998 | -0.128940 | 1.000000 | -0.195045 | -0.279630 | 0.362615 | 0.044708 | 0.395593 | -0.256916 | -0.200666 |
| free sulfur dioxide | -0.282735 | -0.352557 | 0.133126 | 0.402871 | -0.195045 | 1.000000 | 0.720934 | 0.025717 | -0.145854 | -0.188457 | -0.179838 | 0.055463 |
| total sulfur dioxide | -0.329054 | -0.414476 | 0.195242 | 0.495482 | -0.279630 | 0.720934 | 1.000000 | 0.032395 | -0.238413 | -0.275727 | -0.265740 | -0.041385 |
| density | 0.458910 | 0.271296 | 0.096154 | 0.552517 | 0.362615 | 0.025717 | 0.032395 | 1.000000 | 0.011686 | 0.259478 | -0.686745 | -0.305858 |
| pH | -0.252700 | 0.261454 | -0.329808 | -0.267320 | 0.044708 | -0.145854 | -0.238413 | 0.011686 | 1.000000 | 0.192123 | 0.121248 | 0.019506 |
| sulphates | 0.299568 | 0.225984 | 0.056197 | -0.185927 | 0.395593 | -0.188457 | -0.275727 | 0.259478 | 0.192123 | 1.000000 | -0.003029 | 0.038485 |
| alcohol | -0.095452 | -0.037640 | -0.010493 | -0.359415 | -0.256916 | -0.179838 | -0.265740 | -0.686745 | 0.121248 | -0.003029 | 1.000000 | 0.444319 |
| quality | -0.076743 | -0.265699 | 0.085532 | -0.036980 | -0.200666 | 0.055463 | -0.041385 | -0.305858 | 0.019506 | 0.038485 | 0.444319 | 1.000000 |
# With respect to the quality attribute, strongest correlations seen between:
# quality / alcohol (0.444319: positively correlated)
# quality / density (-0.305858: negatively correlated)
# quality / volatile acidity (-0.265699: negatively correlated)
# quality / chlorides (-0.200666: negatively correlated) pairs.
#
# As regards the other set of attributes, strongest correlations seen between:
# free sulfur dioxide / total sulfur dioxide (0.720934: positively correlated)
# alcohol / density (-0.686745: negatively correlated)
# density / residual sugar (0.552517: positively correlated)
# residual sugar / total sulfur dioxide (0.495482: positively correlated)
# density / fixed acidity (0.458910: positively correlated) pairs.
# Creating four distinct quality labels: low, medium, high and excellent
# These four levels are:
# Level 1: Low (for quality attribute values: 0, 1 and 2)
# Level 2: Medium (for quality attribute values: 3, 4 and 5)
# Level 3: High (for quality attribute values: 6, 7 and 8)
# Level 4: Excellent (for quality attribute values: 9 and 10)
wine_data_all_4levels = wine_data_all.copy()
# Bins are right-inclusive: (0, 2.5], (2.5, 5.5], (5.5, 8.5], (8.5, 10].
# include_lowest=True makes the first interval cover quality 0 as documented;
# without it a 0 rating would fall outside every bin and become NaN.
# (No effect on this data — the observed minimum quality is 3.)
bins = [0, 2.5, 5.5, 8.5, 10]
labels = [1, 2, 3, 4]
wine_data_all_4levels['quality'] = pd.cut(wine_data_all_4levels['quality'], bins=bins, labels=labels, include_lowest=True)
print(wine_data_all_4levels['quality'].value_counts())
wine_data_all_4levels.quality.describe()
3 4108 2 2384 4 5 1 0 Name: quality, dtype: int64
count 6497 unique 3 top 3 freq 4108 Name: quality, dtype: int64
# Work on a copy of the 4-level frame; quality is now an ordered categorical.
wine_data_all_n = wine_data_all_4levels.copy()
wine_data_all_n.info()
wine_data_all_n['quality']
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null category dtypes: category(1), float64(11) memory usage: 615.6 KB
0 3
1 3
2 3
3 3
4 3
..
1594 2
1595 3
1596 3
1597 2
1598 3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
# Summary statistics; describe() now excludes the categorical quality column.
wine_data_all_n.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 7.215307 | 0.339666 | 0.318633 | 5.443235 | 0.056034 | 30.525319 | 115.744574 | 0.994697 | 3.218501 | 0.531268 | 10.491801 |
| std | 1.296434 | 0.164636 | 0.145318 | 4.757804 | 0.035034 | 17.749400 | 56.521855 | 0.002999 | 0.160787 | 0.148806 | 1.192712 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 |
| 25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.000000 | 0.992340 | 3.110000 | 0.430000 | 9.500000 |
| 50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994890 | 3.210000 | 0.510000 | 10.300000 |
| 75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 156.000000 | 0.996990 | 3.320000 | 0.600000 | 11.300000 |
| max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 |
# Feature scaling, using Min-Max normalization technique (Normalizing numeric attributes)
#
def normalize(x):
    """Rescale a numeric Series/array to the [0, 1] range (min-max scaling).

    A constant input (max == min) is mapped to all zeros instead of
    dividing by zero, which would otherwise produce NaN/inf values.
    """
    span = x.max() - x.min()
    if span == 0:
        # Degenerate case: every value equals the minimum.
        return x - x.min()
    return (x - x.min()) / span
#
# Every column except the quality label is a numeric feature to rescale.
X = [col for col in wine_data_all_4levels.columns if col != 'quality']
#
# Min-max scale each feature column of a fresh copy; quality is left untouched.
wine_data_all_n = wine_data_all_4levels.copy()
wine_data_all_n[X] = wine_data_all_n[X].apply(normalize)
#
print(wine_data_all_n.shape)
wine_data_all_n.info()
wine_data_all_n.describe()
#
(6497, 12) <class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null category dtypes: category(1), float64(11) memory usage: 615.6 KB
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 0.282257 | 0.173111 | 0.191948 | 0.074283 | 0.078129 | 0.102518 | 0.252868 | 0.146262 | 0.386435 | 0.174870 | 0.361131 |
| std | 0.107143 | 0.109758 | 0.087541 | 0.072972 | 0.058195 | 0.061630 | 0.130235 | 0.057811 | 0.124641 | 0.083599 | 0.172857 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.214876 | 0.100000 | 0.150602 | 0.018405 | 0.048173 | 0.055556 | 0.163594 | 0.100829 | 0.302326 | 0.117978 | 0.217391 |
| 50% | 0.264463 | 0.140000 | 0.186747 | 0.036810 | 0.063123 | 0.097222 | 0.258065 | 0.149990 | 0.379845 | 0.162921 | 0.333333 |
| 75% | 0.322314 | 0.213333 | 0.234940 | 0.115031 | 0.093023 | 0.138889 | 0.345622 | 0.190476 | 0.465116 | 0.213483 | 0.478261 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Display the fully normalized frame.
wine_data_all_n
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.264463 | 0.126667 | 0.216867 | 0.308282 | 0.059801 | 0.152778 | 0.377880 | 0.267785 | 0.217054 | 0.129213 | 0.115942 | 3 |
| 1 | 0.206612 | 0.146667 | 0.204819 | 0.015337 | 0.066445 | 0.045139 | 0.290323 | 0.132832 | 0.449612 | 0.151685 | 0.217391 | 3 |
| 2 | 0.355372 | 0.133333 | 0.240964 | 0.096626 | 0.068106 | 0.100694 | 0.209677 | 0.154039 | 0.418605 | 0.123596 | 0.304348 | 3 |
| 3 | 0.280992 | 0.100000 | 0.192771 | 0.121166 | 0.081395 | 0.159722 | 0.414747 | 0.163678 | 0.364341 | 0.101124 | 0.275362 | 3 |
| 4 | 0.280992 | 0.100000 | 0.192771 | 0.121166 | 0.081395 | 0.159722 | 0.414747 | 0.163678 | 0.364341 | 0.101124 | 0.275362 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | 0.198347 | 0.346667 | 0.048193 | 0.021472 | 0.134551 | 0.107639 | 0.087558 | 0.150183 | 0.565891 | 0.202247 | 0.362319 | 2 |
| 1595 | 0.173554 | 0.313333 | 0.060241 | 0.024540 | 0.088040 | 0.131944 | 0.103687 | 0.154425 | 0.620155 | 0.303371 | 0.463768 | 3 |
| 1596 | 0.206612 | 0.286667 | 0.078313 | 0.026074 | 0.111296 | 0.097222 | 0.078341 | 0.166377 | 0.542636 | 0.297753 | 0.434783 | 3 |
| 1597 | 0.173554 | 0.376667 | 0.072289 | 0.021472 | 0.109635 | 0.107639 | 0.087558 | 0.161172 | 0.658915 | 0.275281 | 0.318841 | 2 |
| 1598 | 0.181818 | 0.153333 | 0.283133 | 0.046012 | 0.096346 | 0.059028 | 0.082949 | 0.161558 | 0.519380 | 0.247191 | 0.434783 | 3 |
6497 rows × 12 columns
# Split a working copy into features X (all columns but the last) and
# target y (the last column, quality).
wine_data_all_n_fs = wine_data_all_n.copy()
wine_data_all_n_fs.info()
X = wine_data_all_n_fs.iloc[:, :-1]
y = wine_data_all_n_fs.iloc[:, -1]
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null category dtypes: category(1), float64(11) memory usage: 615.6 KB
# Inspect the target: category counts and the coded labels.
y.describe()
y
0 3
1 3
2 3
3 3
4 3
..
1594 2
1595 3
1596 3
1597 2
1598 3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
# Analysing the physicochemical features that impact the quality attribute the most and
# performing feature selection by using the 'analysis of variance' (ANOVA) method.
# (The fit/transform pass is done exactly once here; the original ran an
# identical SelectKBest fit twice back to back.)
import plotly.express as px
from sklearn.feature_selection import SelectKBest, f_classif

# Removing the quality attribute from the dataframe
wine_data_all_n_fs = wine_data_all_n.copy()
X = wine_data_all_n_fs[wine_data_all_n_fs.columns[0:-1]]
y = wine_data_all_n_fs[wine_data_all_n_fs.columns[-1]]
X.info()

# Score every feature against the quality labels; k='all' keeps them all.
subset_features = SelectKBest(score_func=f_classif, k='all')
subset_features.fit(X, y)
X_train_ = subset_features.transform(X)
X_test_ = subset_features.transform(X)

# Tabulate the per-feature ANOVA F-scores, highest first, and plot them.
subset_features_names = pd.DataFrame()
subset_features_names['Features'] = subset_features.feature_names_in_
subset_features_names['Score'] = subset_features.scores_
subset_features_names.sort_values(by='Score', ascending=False, inplace=True)
fig = px.histogram(subset_features_names, x='Features', y='Score', text_auto=True, color='Features', title='Scores:', template='simple_white')
fig.show()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 dtypes: float64(11) memory usage: 609.1 KB
# Based on the above scores, we see that the 'alcohol', 'density', 'volatile acidity' and 'chlorides' attributes would be
# impacting quality the most. These results are also consistent with the results of the correlation matrix.
# Another interesting observation is that the scores of the other seven physicochemical features
# are very low compared to these four attributes, so we can omit the low-score features and
# use only the four high-score features for our predictive model building tasks.
#
# Dropping the low-score features and keeping only the four high-score
# features — 'alcohol', 'density', 'volatile acidity' and 'chlorides' —
# plus the quality label, for the analysis that follows.
low_score_features = ['fixed acidity', 'citric acid', 'residual sugar',
                      'free sulfur dioxide', 'total sulfur dioxide',
                      'pH', 'sulphates']
wine_data_all_n_selected_subset_of_attributes = wine_data_all_n.drop(low_score_features, axis=1)
wine_data_all_n_selected_subset_of_attributes.info()
wine_data_all_n_selected_subset_of_attributes.describe()
#
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 volatile acidity 6497 non-null float64 1 chlorides 6497 non-null float64 2 density 6497 non-null float64 3 alcohol 6497 non-null float64 4 quality 6497 non-null category dtypes: category(1), float64(4) memory usage: 260.3 KB
| volatile acidity | chlorides | density | alcohol | |
|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 0.173111 | 0.078129 | 0.146262 | 0.361131 |
| std | 0.109758 | 0.058195 | 0.057811 | 0.172857 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.100000 | 0.048173 | 0.100829 | 0.217391 |
| 50% | 0.140000 | 0.063123 | 0.149990 | 0.333333 |
| 75% | 0.213333 | 0.093023 | 0.190476 | 0.478261 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
#
# Inspect the binned quality labels of the reduced frame.
wine_data_all_n_selected_subset_of_attributes.quality
#
0 3
1 3
2 3
3 3
4 3
..
1594 2
1595 3
1596 3
1597 2
1598 3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
# Side-by-side sanity check of the reduced (5-column) and full (12-column) frames.
wine_data_all_n_selected_subset_of_attributes.info()
wine_data_all_n_selected_subset_of_attributes.describe()
wine_data_all_n.info()
wine_data_all_n.describe()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 volatile acidity 6497 non-null float64 1 chlorides 6497 non-null float64 2 density 6497 non-null float64 3 alcohol 6497 non-null float64 4 quality 6497 non-null category dtypes: category(1), float64(4) memory usage: 260.3 KB <class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null category dtypes: category(1), float64(11) memory usage: 615.6 KB
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
| mean | 0.282257 | 0.173111 | 0.191948 | 0.074283 | 0.078129 | 0.102518 | 0.252868 | 0.146262 | 0.386435 | 0.174870 | 0.361131 |
| std | 0.107143 | 0.109758 | 0.087541 | 0.072972 | 0.058195 | 0.061630 | 0.130235 | 0.057811 | 0.124641 | 0.083599 | 0.172857 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.214876 | 0.100000 | 0.150602 | 0.018405 | 0.048173 | 0.055556 | 0.163594 | 0.100829 | 0.302326 | 0.117978 | 0.217391 |
| 50% | 0.264463 | 0.140000 | 0.186747 | 0.036810 | 0.063123 | 0.097222 | 0.258065 | 0.149990 | 0.379845 | 0.162921 | 0.333333 |
| 75% | 0.322314 | 0.213333 | 0.234940 | 0.115031 | 0.093023 | 0.138889 | 0.345622 | 0.190476 | 0.465116 | 0.213483 | 0.478261 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Split each frame into a feature matrix (all columns but the last) and
# the quality target (the last column).
# Full-feature dataset:
x_all = wine_data_all_n.iloc[:, :-1]
y_all = wine_data_all_n.iloc[:, -1]
# Four-selected-feature dataset:
x_all_sf = wine_data_all_n_selected_subset_of_attributes.iloc[:, :-1]
y_all_sf = wine_data_all_n_selected_subset_of_attributes.iloc[:, -1]
# Dividing each dataset into training and testing sets.
from sklearn.model_selection import train_test_split
# 80:20 train:test split; fixed seed for reproducibility.
split_kwargs = dict(test_size=0.2, random_state=41)
# Full-feature dataset:
x_all_train, x_all_test, y_all_train, y_all_test = train_test_split(x_all, y_all, **split_kwargs)
# Four-selected-feature dataset:
x_all_sf_train, x_all_sf_test, y_all_sf_train, y_all_sf_test = train_test_split(x_all_sf, y_all_sf, **split_kwargs)
# Summarise the class balance of each target split (same print order as before).
for data in [y_all_train, y_all_test, y_all_sf_train, y_all_sf_test]:
    print(data.describe())
#
import warnings
warnings.filterwarnings("ignore")
#
count 5197 unique 3 top 3 freq 3291 Name: quality, dtype: int64 count 1300 unique 3 top 3 freq 817 Name: quality, dtype: int64 count 5197 unique 3 top 3 freq 3291 Name: quality, dtype: int64 count 1300 unique 3 top 3 freq 817 Name: quality, dtype: int64
# Decision Tree implementation: using all features
#
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
#
# Finding the best combination of parameters to use.
# FIX: cap max_features at the actual number of predictor columns.
# The previous hard-coded range(1, 21) exceeded the 11 available features,
# producing invalid candidates that GridSearchCV silently scored as failed fits.
model_dt_all = DecisionTreeClassifier(random_state=111)
params_ = {
    "max_depth": range(1, 11),
    "max_features": range(1, x_all_train.shape[1] + 1),
    "criterion": ["gini", "entropy"],
}
dt_all_p = GridSearchCV(model_dt_all, params_, cv=4)
dt_all_p.fit(x_all_train, y_all_train)
print(dt_all_p.best_params_)
{'criterion': 'gini', 'max_depth': 9, 'max_features': 8}
# Decision Tree on the full feature set, configured with the tuned
# hyper-parameters found by the grid search above.
dt_all = DecisionTreeClassifier(
    criterion='gini', max_depth=9, max_features=8, random_state=111)
pred_dt_all = dt_all.fit(x_all_train, y_all_train).predict(x_all_test)
print(classification_report(y_all_test, pred_dt_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_dt_all))
# 4-fold cross-validation on the training portion as a stability check
cross_val_dt_all = cross_val_score(dt_all, x_all_train, y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_dt_all.mean())
#
precision recall f1-score support
2 0.70 0.65 0.68 482
3 0.80 0.84 0.82 817
4 0.00 0.00 0.00 1
accuracy 0.77 1300
macro avg 0.50 0.50 0.50 1300
weighted avg 0.77 0.77 0.77 1300
Accuracy: 0.7684615384615384
Cross Validation Score: 0.7504321371469178
# Decision Tree implementation: using the subset of four selected features
#
# Finding the best combination of parameters to use.
# FIX: cap max_features at the actual number of predictor columns.
# The previous hard-coded range(1, 21) exceeded the 4 available features,
# so most candidates were invalid and silently scored as failed fits.
model_dt_sf = DecisionTreeClassifier(random_state=111)
params_ = {
    "max_depth": range(1, 11),
    "max_features": range(1, x_all_sf_train.shape[1] + 1),
    "criterion": ["gini", "entropy"],
}
dt_sf_p = GridSearchCV(model_dt_sf, params_, cv=4)
dt_sf_p.fit(x_all_sf_train, y_all_sf_train)
print(dt_sf_p.best_params_)
#
{'criterion': 'gini', 'max_depth': 6, 'max_features': 2}
# Decision Tree on the four selected features, configured with the tuned
# hyper-parameters found by the grid search above.
dt_sf = DecisionTreeClassifier(
    criterion='gini', max_depth=6, max_features=2, random_state=111)
pred_dt_sf = dt_sf.fit(x_all_sf_train, y_all_sf_train).predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_dt_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_dt_sf))
# 4-fold cross-validation on the training portion as a stability check
cross_val_dt_sf = cross_val_score(dt_sf, x_all_sf_train, y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_dt_sf.mean())
#
precision recall f1-score support
2 0.70 0.59 0.64 482
3 0.78 0.85 0.81 817
4 0.00 0.00 0.00 1
accuracy 0.75 1300
macro avg 0.49 0.48 0.48 1300
weighted avg 0.75 0.75 0.75 1300
Accuracy: 0.7538461538461538
Cross Validation Score: 0.7383126073310831
# Random Forest on the full feature set (library-default hyper-parameters).
from sklearn.ensemble import RandomForestClassifier

rf_all = RandomForestClassifier(random_state=111)
pred_rf_all = rf_all.fit(x_all_train, y_all_train).predict(x_all_test)
print(classification_report(y_all_test, pred_rf_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_rf_all))
# 4-fold cross-validation on the training portion as a stability check
cross_val_rf_all = cross_val_score(rf_all, x_all_train, y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_rf_all.mean())
#
precision recall f1-score support
2 0.80 0.72 0.76 482
3 0.84 0.89 0.87 817
4 0.00 0.00 0.00 1
accuracy 0.83 1300
macro avg 0.55 0.54 0.54 1300
weighted avg 0.83 0.83 0.83 1300
Accuracy: 0.8284615384615385
Cross Validation Score: 0.8195111624326405
# Random Forest on the four selected features (library-default hyper-parameters).
rf_sf = RandomForestClassifier(random_state=111)
pred_rf_sf = rf_sf.fit(x_all_sf_train, y_all_sf_train).predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_rf_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_rf_sf))
# 4-fold cross-validation on the training portion as a stability check
cross_val_rf_sf = cross_val_score(rf_sf, x_all_sf_train, y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_rf_sf.mean())
#
precision recall f1-score support
2 0.78 0.69 0.73 482
3 0.83 0.88 0.86 817
4 0.00 0.00 0.00 1
accuracy 0.81 1300
macro avg 0.54 0.53 0.53 1300
weighted avg 0.81 0.81 0.81 1300
Accuracy: 0.813076923076923
Cross Validation Score: 0.7810274175401196
# k-Nearest Neighbour (k-NN) implementation: using all features
#
from sklearn.neighbors import KNeighborsClassifier

# Grid-search the neighbour count, vote weighting and distance metric.
# NOTE(review): 'euclidean' duplicates 'minkowski' with its default p=2.
model_kNN_all = KNeighborsClassifier()
params_ = [{
    'n_neighbors': [2, 3, 4, 5, 6],
    'weights': ['uniform', 'distance'],
    'metric': ('minkowski', 'chebyshev', 'euclidean'),
}]
kNN_all_p = GridSearchCV(model_kNN_all, params_, cv=4)
kNN_all_p.fit(x_all_train, y_all_train)
print(kNN_all_p.best_params_)
{'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'}
# k-NN on the full feature set, configured with the tuned
# hyper-parameters found by the grid search above.
kNN_all = KNeighborsClassifier(metric='minkowski', n_neighbors=6, weights='distance')
pred_kNN_all = kNN_all.fit(x_all_train, y_all_train).predict(x_all_test)
print(classification_report(y_all_test, pred_kNN_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_kNN_all))
# 4-fold cross-validation on the training portion as a stability check
cross_val_kNN_all = cross_val_score(kNN_all, x_all_train, y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_kNN_all.mean())
#
precision recall f1-score support
2 0.74 0.68 0.71 482
3 0.82 0.86 0.84 817
4 0.00 0.00 0.00 1
accuracy 0.79 1300
macro avg 0.52 0.51 0.52 1300
weighted avg 0.79 0.79 0.79 1300
Accuracy: 0.7923076923076923
Cross Validation Score: 0.7966117427607036
# k-Nearest Neighbour (k-NN) implementation: using the subset of four selected features
#
# Grid-search the neighbour count, vote weighting and distance metric.
# NOTE(review): 'euclidean' duplicates 'minkowski' with its default p=2.
model_kNN_sf = KNeighborsClassifier()
params_ = [{
    'n_neighbors': [2, 3, 4, 5, 6],
    'weights': ['uniform', 'distance'],
    'metric': ('minkowski', 'chebyshev', 'euclidean'),
}]
kNN_sf_p = GridSearchCV(model_kNN_sf, params_, cv=4)
kNN_sf_p.fit(x_all_sf_train, y_all_sf_train)
print(kNN_sf_p.best_params_)
{'metric': 'chebyshev', 'n_neighbors': 6, 'weights': 'distance'}
# k-NN on the four selected features, configured with the tuned
# hyper-parameters found by the grid search above.
kNN_sf = KNeighborsClassifier(metric='chebyshev', n_neighbors=6, weights='distance')
pred_kNN_sf = kNN_sf.fit(x_all_sf_train, y_all_sf_train).predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_kNN_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_kNN_sf))
# 4-fold cross-validation on the training portion as a stability check
cross_val_kNN_sf = cross_val_score(kNN_sf, x_all_sf_train, y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_kNN_sf.mean())
#
precision recall f1-score support
2 0.76 0.67 0.72 482
3 0.82 0.88 0.85 817
4 0.00 0.00 0.00 1
accuracy 0.80 1300
macro avg 0.53 0.52 0.52 1300
weighted avg 0.80 0.80 0.80 1300
Accuracy: 0.8007692307692308
Cross Validation Score: 0.7683263753182921
# Support Vector Machine (SVM) on the full feature set
# (library-default hyper-parameters).
from sklearn.svm import SVC

svm_all = SVC(random_state=111)
pred_svm_all = svm_all.fit(x_all_train, y_all_train).predict(x_all_test)
print(classification_report(y_all_test, pred_svm_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_svm_all))
# 4-fold cross-validation on the training portion as a stability check
cross_val_svm_all = cross_val_score(svm_all, x_all_train, y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_svm_all.mean())
#
precision recall f1-score support
2 0.74 0.60 0.66 482
3 0.78 0.87 0.83 817
4 0.00 0.00 0.00 1
accuracy 0.77 1300
macro avg 0.51 0.49 0.50 1300
weighted avg 0.77 0.77 0.76 1300
Accuracy: 0.77
Cross Validation Score: 0.7562077337596969
# SVM on the four selected features (library-default hyper-parameters).
svm_sf = SVC(random_state=111)
pred_svm_sf = svm_sf.fit(x_all_sf_train, y_all_sf_train).predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_svm_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_svm_sf))
# 4-fold cross-validation on the training portion as a stability check
cross_val_svm_sf = cross_val_score(svm_sf, x_all_sf_train, y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_svm_sf.mean())
#
precision recall f1-score support
2 0.72 0.57 0.63 482
3 0.77 0.87 0.82 817
4 0.00 0.00 0.00 1
accuracy 0.76 1300
macro avg 0.50 0.48 0.48 1300
weighted avg 0.75 0.76 0.75 1300
Accuracy: 0.7569230769230769
Cross Validation Score: 0.739081986143187
# We have implemented the four algorithms, Decision Tree, Random Forest, k-Nearest Neighbour (k-NN) and
# Support Vector Machine (SVM), first by using all the attributes in the dataset and then by using
# the subset of the four most-relevant selected features ('alcohol', 'density', 'volatile acidity' and 'chlorides').
#
# After reviewing the model performance of all the implemented iterations, we see that the performance indicators,
# when using the full set of features and when using the subset of the four selected features,
# are almost the same for all four algorithms.
#
# This would be consistent with our ANOVA (analysis of variance) findings,
# where we see that the scores of the other seven physicochemical features are very low compared to these four features,
# and hence they have a much lower impact on the 'quality' attribute.
#
# We also see that all four algorithms provide robust performance.
# The accuracy scores range from 75.385% to 82.846%.
# The lowest, 75.385%, is for the Decision Tree implementation using the subset of four selected features, and
# the highest, 82.846%, is for the Random Forest implementation using all features in the sample set.
#